library(data.table)
library(ggplot2)
library(caret)
library(nnet)
library(pls)
library(FactoMineR)
library(qqman)
library(mixOmics)
Import des données
data(Phenotype)
Warning: data set ‘Phenotype’ not found
data(Genomic)
Warning: data set ‘Genomic’ not found
data("localisation")
Warning: data set ‘localisation’ not found
pheno <- as.data.table(Phenotype)
loc_df<- as.data.table(localisation)
geno<- as.data.table(Genomic)
geno[, ID := rownames(Genomic)]
pheno[, ID := rownames(Phenotype)]
merged<- merge(pheno, loc_df[,list(Population,ID)], by = "ID")
#merged<- merge(merged, geno, by = "ID")
LM (Pheno ~ population )
df <- as.data.table(merged)
phenos <- names(df)[sapply(df, is.numeric)]
phenos <- setdiff(phenos, c("class_index","Lat","Lgn","ID"))
phenos
[1] "HT2009" "CIRC2009" "HT2011" "CIRC2011" "H"
[6] "G" "S" "H_G" "S_G" "Klason_lignin"
[11] "Glucose" "Xyl_Glu" "C5_C6" "Wet_chem_extractives" "Dia2015_sqrt"
[16] "InfraDens" "Rust" "BrAnglVert" "RamifSyllep" "BudFlushSlope"
get_r2 <- function(var) {
form<- as.formula(paste(var, "~ Population"))
mod<- lm(form, data = df)
summary(mod)$r.squared
}
r2_values<- data.table(
phenotype = phenos,
r2= sapply(phenos, get_r2)
)
ggplot(r2_values, aes(x = reorder(phenotype, r2), y = r2)) +
geom_point(size = 3) +
geom_segment(aes(x = phenotype, xend = phenotype, y = 0, yend = r2)) +
coord_flip() +
ylab("R² lm(Phénotype ~ Population)") +
xlab("Caractère phénotypique") +
theme_minimal()

khi 2 snp ~ pop
table cont snp chr13 et pop => haplo pour différentier la pop
Data visualisation
Groupes de variables : chromosomes, phénotypes, populations
constructions des axes : chromosomes => on sépare les individus
selon leur distances génomiques et on projette en supplémentaires les
phénotypes / population
on pourra voir quels chromosomes ne sont pas du tout corrélés avec des
traits phénotypiques
Récupération des positions :
merged<- merge(merged, geno, by = "ID")
geno_mat <- as.matrix(geno)
coords <- do.call(rbind, strsplit(colnames(geno), "_"))
Warning: number of columns of result is not a multiple of vector length (arg 1)
chrom <- as.factor(coords[,1])
pos <- as.numeric(coords[,2])
Warning: NAs introduced by coercion
PCA
pca <- PCA(merged[,2:217022], quali.sup=21 ,quanti.sup=(2:20) )


plot(pca,choix="ind",habillage=21)

plot(pca,choix="var", #invisible = c("quali","var"),
label = c("quanti.sup"),
select = "coord 10" )
MixOmic
sPCA
X<- as.matrix(merged[,23:217022])
X2 <- X[, apply(X, 2, sd) != 0]
Y <- as.matrix(merged$CIRC2011)
result.spca.multi <- spca(X2, keepX = c(50, 30))
plotIndiv(result.spca.multi)

plotVar(result.spca.multi)
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
Please use tidy evaluation idioms with `aes()`.
See also `vignette("ggplot2-in-packages")` for more information.

# extract the variables used to construct the first PC
selectVar(result.spca.multi, comp = 1)$name
[1] "Chr02_6422161" "Chr02_6427114" "Chr02_6427264" "Chr02_6416381" "Chr13_15625954" "Chr13_15574007"
[7] "Chr13_15627568" "Chr07_7181966" "Chr13_15626544" "Chr02_6382504" "Chr13_15635566" "Chr02_6422861"
[13] "Chr02_6423252" "Chr02_6426926" "Chr12_3267410" "Chr12_3264433" "Chr12_3267439" "Chr12_3264479"
[19] "Chr10_20127228" "Chr01_42022246" "Chr17_7247760" "Chr10_20121818" "Chr10_20126974" "Chr17_5836683"
[25] "Chr02_6325961" "Chr03_18516527" "Chr06_22417885" "Chr02_6332720" "Chr11_17347389" "Chr06_24641715"
[31] "Chr11_17347361" "Chr14_13405721" "Chr11_17347960" "Chr01_14722117" "Chr06_24641246" "Chr03_11418460"
[37] "Chr03_11422148" "Chr01_14722377" "Chr03_15711505" "Chr11_4789295" "Chr01_14717384" "Chr15_9806504"
[43] "Chr17_14258124" "Chr17_14265429" "Chr07_12991228" "Chr07_12991382" "Chr01_8617466" "Chr06_20423555"
[49] "Chr07_12997322" "Chr01_14708394"
# depict weight assigned to each of these variables
plotLoadings(result.spca.multi, method = 'mean', contrib = 'max')

block PLS
pls.result <- mixOmics::pls(X, Y)
Warning: the standard deviation is zero
plotIndiv(pls.result)

plotIndiv(pls.result, group = merged$Population,
rep.space = 'XY-variate',
ellipse = TRUE, # plot using the ellipses
legend = TRUE)

block sPLS
head(merged[,2:21])
Y <- as.matrix(merged[,2:21])
spls.result <- mixOmics::spls(X2, Y, ncomp=5, keepX = c(rep(1000,5)))
plotIndiv(spls.result, group = merged$Population,
rep.space = 'XY-variate',
ellipse = TRUE, # plot using the ellipses
legend = TRUE)

# Identifier les coefficients non nuls
nonzero_idx <- loadX != 0
Error: object 'loadX' not found
manhattan plot
manhattan(merged_nonzero, chr = "1", bp = "2", p = "loading",snp = "SNP",
col = c("blue4", "orange3"),
cex = 0.6) # taille des points
block PLS_DA
# use the mirna, mrna and protein expression levels as predictive datasets
# note that each dataset is measured across the same individuals (samples)
X1<- as.matrix(merged[,23:217022])
X1 <- X1[, apply(X1, 2, sd) != 0]
X2 <- as.matrix(merged[,2:21])
Y <- as.factor(merged$Population)
X <- list(geno = X1, pheno = X2)
result.diablo.tcga <- block.plsda(X, Y) # run the method
plotIndiv(result.diablo.tcga,
rep.space = 'XY-variate',
ellipse = TRUE, # plot using the ellipses
legend = TRUE)

Block sPLS DA
# set the number of features to use for the X datasets
list.keepX = list(geno = c(rep(10000,5)), pheno = c(rep(20,5)))
# run the method
result.sparse.diablo.tcga <- block.splsda(X, Y, keepX = list.keepX, ncomp = 5)
plotLoadings(result.sparse.diablo.tcga, ncomp = 5)

Chr13_15574007 ressort encore
plotIndiv(result.sparse.diablo.tcga,ellipse = TRUE,legend = TRUE)

plotVar(result.sparse.diablo.tcga) # plot the variables

Bootstrapped sPLS
library(spls)
y <- merged$CIRC2009
y <- as.numeric(y)
X <- as.matrix(merged[,23:217022])
# Split stratifié
# Ici, on fait un split simple car phénotype continu
train_index <- createDataPartition(y, p = 0.9, list = FALSE)
X_train <- X[train_index, ]
var_threshold <- 1e-8
nonzero_cols <- apply(X_train, 2, var) > var_threshold
length(nonzero_cols)
X_train <- X_train[, nonzero_cols]
X_test <- X[-train_index, ]
X_test <- X_test[, nonzero_cols]
y_train <- y[train_index]
y_test <- y[-train_index]
eta = seq(0.1,0.9,0.1)
K = c(1:5)
cv.spls( X_train, y_train, fold=5, K, eta, kappa=0.5, select="pls2", fit="simpls",scale.x=FALSE, scale.y=FALSE, plot.it=TRUE )
model <- spls(X_train, y_train, K = 4, eta =0.2 , kappa=0.5, select="pls2", fit="simpls",
scale.x=FALSE, scale.y=FALSE, eps=1e-4, maxstep=100, trace=FALSE)
ped <- predict.spls( model, X_test, type = c("fit","coefficient"))
ped
f <- spls::ci.spls( model, coverage=0.95, B=1000,
plot.it=TRUE, plot.fix="y",
plot.var=NA, K=model$K, fit=model$fit )
names(f)
nonzero_idx <- which(f$betahat != 0)
betahat_nonzero <- f$betahat[nonzero_idx]
length(nonzero_idx)
MFA
res = MFA(geno_mat, group=c(25974,16646,12284,11790,14769,17004,8327,14040,9840,15964,7483,7403,8040,11123,8486,7629,6747,8267,4876,308), type=c(rep("s",20)), ncp=5, name.group=c("Chr01","Chr02","Chr03","Chr04", "Chr05","Chr06","Chr07","Chr08","Chr09","Chr10", "Chr11","Chr12","Chr13", "Chr14","Chr15","Chr16","Chr17","Chr18","Chr19", "scaffold"))
dim(geno_mat)
PLS : Phenotype ~ all SNP
inner_train_index <- createDataPartition(y_train, p = 0.9, list = FALSE)
X_train <- X_train_df[inner_train_index, , drop = FALSE]
y_train <- y_train[inner_train_index]
X_test <- X_train_df[-inner_train_index, , drop = FALSE]
y_test <- y_train[-inner_train_index]
df_train <- data.frame(y = y_train, X_train_df)
pls_final <- plsr(y ~ ., data = df_train, ncomp = ncomp_opt, validation = "none")
# Évaluation sur le fold externe
y_pred_final <- as.vector(predict(pls_final, newdata = X_test_df, ncomp = ncomp_opt))
PLS Subset SNP | calcul de VIP | nested CV
\[VIP_{j} = \sqrt{ p \frac{\displaystyle
\sum_{h=1}^{A} SS_{Y,h}\frac{w_{jh}^{2}}{\lVert w_{h}
\rVert^{2}}}{\displaystyle \sum_{h=1}^{A} SS_{Y,h}} }\]
set.seed(123)
# Paramètres
n_iter <- 400
subset_size <- 10000 # nombre de SNP aléatoires par itération
ncomp_candidates <- 1:5 # candidats pour ncomp
nfold_outer <- 5
nfold_inner <- 10
results_perf
vip_perf<- results_perf[, .(R2_test_mean = mean(R2_test)), by = iter]
vip_perf_fold<- results_perf[, .(R2_test_mean = mean(R2_test)), by = fold]
ggplot(results_perf[1:1000,], aes(x = factor(iter), y = R2_test)) +
geom_violin(fill = "skyblue", color = "black") +
labs(x = "Itération", y = expression(R^2), title = "R² 10-outer-fold ") +
theme_minimal()
mean(vip_perf_fold$R2_test_mean)
vip_perf_fold
library(data.table)
library(ggplot2)
# Calcul du VIP moyen par SNP
vip_mean<- results_vip[, .(VIP_mean = mean(VIP),SNP_count = .N), by = SNP]
# Sélection des 30 SNP ayant le VIP moyen le plus élevé
topN<- vip_mean[order(-VIP_mean)][0:5000, SNP]
df_topNPLS<- results_vip[SNP %in% topN]
df_topNPLS <- merge(df_topNPLS, vip_mean[, .(SNP, SNP_count)], by = "SNP")
df_topNPLS$SNP_count <- df_topNPLS$SNP_count/10
ggplot(df_topNPLS, aes(x = reorder(SNP,VIP), y = VIP, fill = SNP_count)) +
geom_violin(trim = FALSE) +
geom_boxplot(width = 0.1, outlier.size = 0.5) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme_bw() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
) +
labs(
x = "SNP",
y = "VIP",
title = "Distribution du VIP pour les n = 50 SNP les plus importants",
fill = "Nbr de selection du SNP"
)
hist(vip_mean$VIP_mean, breaks = seq(min(vip_mean$VIP_mean), max(vip_mean$VIP_mean), length.out = 50),
main = "Distribution des scores VIP moyens",
xlab = "scores VIP moyens", col = "lightblue", border = "white")
Lasso
library(glmnet)
# Assurer que y est numérique et X est une matrice
y <- as.numeric(pheno$CIRC2009)
X <- as.matrix(geno)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
# Ajustement du Lasso avec validation croisée pour choisir lambda
set.seed(123)
cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1, nfolds = 5)
# Afficher la meilleure valeur de lambda
best_lambda <- cv_lasso$lambda.min
print(best_lambda)
# Tracer l'erreur de validation croisée
plot(cv_lasso)
# Ajuster le modèle final avec lambda optimal
lasso_model <- glmnet(X_train, y_train, alpha = 1, lambda = best_lambda)
# Extraire les coefficients
coef(lasso_model)
print(best_lambda)
y_pred <- predict(lasso_model, newx = X_test)
# Calcul du R²
SSE <- sum((y_test - y_pred)^2) # somme des carrés des erreurs
SST <- sum((y_test - mean(y))^2) # somme totale des carrés
R2 <- 1 - SSE/SST
print(sqrt(SSE))
R2
coef_df <- data.frame(
SNP = rownames(coef(lasso_model)),
Coefficient = as.numeric(coef(lasso_model))
)
coef_nonzero <- coef_df[coef_df$Coefficient != 0, ]
coef_nonzero_dt <- as.data.table(coef_nonzero)
# Top N SNP par valeur absolue des coefficients
N <- 50
coef_nonzero_dt[, abs_coef := abs(Coefficient)]
topN_Lasso <- coef_nonzero_dt[order(-abs_coef)][1:N, .(SNP, Coefficient)]
topN_Lasso
Comparaison selection Lasso et PLS
common_snps <- intersect(df_topNPLS$SNP, coef_nonzero$SNP)
# SNP communs
common_snps
length(common_snps)
Régression ridge adaptative
y <- as.numeric(pheno$CIRC2009)
X <- as.matrix(geno)
# Split externe train/test
set.seed(123)
train_index <- createDataPartition(y, p = 0.9, list = FALSE)
X_train <- X[train_index, ]
X_test <- X[-train_index, ]
y_train <- y[train_index]
y_test <- y[-train_index]
# Split interne du train en D1 et D2 : 50/50
set.seed(123)
idx_D1 <- createDataPartition(y_train, p = 0.5, list = FALSE)
X_D1 <- X_train[idx_D1, ]
X_D2 <- X_train[-idx_D1, ]
y_D1 <- y_train[idx_D1]
y_D2 <- y_train[-idx_D1]
STEP I : SCREENING (Elastic Net sur D1)
alpha <- 1 # Elastic Net si alpha = 0.5, Lasso si = 1, ridge si =0
set.seed(123)
cv_enet <- cv.glmnet(
X_D1, y_D1,
alpha = alpha,
nfolds = 10,
standardize = TRUE
)
lambda_hat <- cv_enet$lambda.min
lambda_hat
# Ajustement final
enet_model <- glmnet(X_D1, y_D1, alpha = alpha, lambda = 0.002)
coefs <- coef(enet_model)
selected <- which(coefs[-1] != 0) # indices des SNP sélectionnés
S_hat <- selected
cat("Nombre de variables sélectionnées (screening):", length(S_hat), "\n")
# Poids dérivés des coefficients Elastic Net
beta_hat <- as.numeric(coefs[-1])
weights <- abs(beta_hat)
weights <- weights[S_hat]
weights <- weights / max(weights) # normalisation
lambda_hat <- 0.002
STEP II : CLEANING (Ridge sur D2)
X_D2_sel <- X_D2[, S_hat, drop = FALSE]
# Pondération Ridge : mettre plus de pénalité sur les petites β_EN
ridge_penalty <- 1 / (weights + 1e-6)
set.seed(123)
cv_ridge <- cv.glmnet(
X_D2_sel, y_D2,
alpha = 0, # alpha = 0 → Ridge
nfolds = 10,
penalty.factor = ridge_penalty
)
lambda_ridge <- cv_ridge$lambda.min
# Ajustement final sur D2
ridge_model <- glmnet(
X_D2_sel, y_D2,
alpha = 0,
lambda = lambda_ridge,
penalty.factor = ridge_penalty
)
# Coefficients ridge finaux (sur D2)
beta_clean <- coef(ridge_model)
STEP III : REFIT FINAL SUR D = D1 ∪ D2 (train complet)
X_train_sel <- X_train[, S_hat, drop = FALSE]
ridge_final <- glmnet(
X_train_sel, y_train,
alpha = 0,
lambda = lambda_ridge,
penalty.factor = ridge_penalty
)
# Prédictions sur le test externe
X_test_sel <- X_test[, S_hat, drop = FALSE]
y_pred <- predict(ridge_final, newx = X_test_sel)
# R² externe
R2_test <- 1 - sum((y_test - y_pred)^2) / sum((y_test - mean(y_test))^2)
cat("R2 externe :", R2_test, "\n")
---
title: "Clustering LD phenotypique based"
format: html
output: html_notebook
editor: visual
---

```{r}
library(data.table)
library(ggplot2)
library(caret)
library(nnet)
library(pls)
library(FactoMineR)
library(qqman)
library(mixOmics)

```

# Import des données

```{r}
data(Phenotype)
data(Genomic)
data("localisation")
```

```{r}
pheno <- as.data.table(Phenotype)
loc_df<- as.data.table(localisation)
geno<- as.data.table(Genomic)
geno[, ID := rownames(Genomic)]
pheno[, ID := rownames(Phenotype)]

merged<- merge(pheno, loc_df[,list(Population,ID)], by = "ID")

```

# LM (Pheno \~ population )

```{r}
df <- as.data.table(merged)
```



```{r}
phenos <- names(df)[sapply(df, is.numeric)]
phenos <- setdiff(phenos, c("class_index","Lat","Lgn","ID"))
```

```{r}
phenos
```


```{r}
get_r2 <- function(var) {
  form<- as.formula(paste(var, "~ Population"))
  mod<- lm(form, data = df)
  summary(mod)$r.squared
}

r2_values<- data.table(
  phenotype = phenos,
  r2= sapply(phenos, get_r2))
```

```{r}

ggplot(r2_values, aes(x = reorder(phenotype, r2), y = r2)) +
  geom_point(size = 3) +
  geom_segment(aes(x = phenotype, xend = phenotype, y = 0, yend = r2)) +
  coord_flip() +
  ylab("R² lm(Phénotype ~ Population)") +
  xlab("Caractère phénotypique") +
  theme_minimal()
```
Idée : test du khi² SNP ~ population

Table de contingence des SNP du chromosome 13 et des populations =&gt; haplotype pour différencier les populations

# Data visualisation

Groupes de variables : chromosomes, phénotypes, populations

constructions des axes : chromosomes =\> on sépare les individus selon leur distances génomiques et on projette en supplémentaires les phénotypes / population

on pourra voir quels chromosomes ne sont pas du tout corrélés avec des traits phénotypiques

# Récupération des positions :

```{r}
merged<- merge(merged, geno, by = "ID")
geno_mat <- as.matrix(geno)
coords <- do.call(rbind, strsplit(colnames(geno), "_"))
chrom <- as.factor(coords[,1])
pos <- as.numeric(coords[,2])
```

## PCA

```{r}
pca <- PCA(merged[,2:217022], quali.sup=21 ,quanti.sup=(2:20) )
```

```{r}
plot(pca,choix="ind",habillage=21)
```

```{r}
plot(pca,choix="var", #invisible = c("quali","var"),
     label = c("quanti.sup"),
     select = "coord 10" )
```

## MixOmic

### sPCA

```{r}
X<- as.matrix(merged[,23:217022])
X2 <- X[, apply(X, 2, sd) != 0]

Y <- as.matrix(merged$CIRC2011)

result.spca.multi <- spca(X2, keepX = c(50, 30))

plotIndiv(result.spca.multi)  
plotVar(result.spca.multi)   

# extract the variables used to construct the first PC
selectVar(result.spca.multi, comp = 1)$name 
# depict weight assigned to each of these variables
plotLoadings(result.spca.multi, method = 'mean', contrib = 'max')
```

### block PLS

```{r}
pls.result <- mixOmics::pls(X, Y) 
plotIndiv(pls.result) 
```

```{r}
plotIndiv(pls.result, group = merged$Population, 
          rep.space = 'XY-variate', 
          ellipse = TRUE,  # plot using the ellipses
          legend = TRUE)
```

### block sPLS

```{r}
head(merged[,2:21])
```

```{r}
Y <- as.matrix(merged[,2:21])

spls.result <- mixOmics::spls(X2, Y, ncomp=5, keepX = c(rep(1000,5))) 

plotIndiv(spls.result, group = merged$Population, 
          rep.space = 'XY-variate', 
          ellipse = TRUE,  # plot using the ellipses
          legend = TRUE)
```

```{r}
# Identifier les coefficients non nuls
nonzero_idx <- loadX != 0
df_nonzero_X <- data.frame(
  SNP = names(loadX)[nonzero_idx],
  loading  = loadX[nonzero_idx]
)
coords <- do.call(rbind, strsplit(rownames(df_nonzero_X), "_"))
coords[,1] <- as.numeric(sub("Chr", "", coords[,1]))
pos <- as.numeric(coords[,2])
merged_nonzero <- cbind(coords,df_nonzero_X)
merged_nonzero$"2" <- as.numeric(merged_nonzero$"2")
merged_nonzero$"1" <- as.numeric(merged_nonzero$"1")
merged_nonzero$loading <- abs(merged_nonzero$loading)
```

# manhattan plot

```{r}
manhattan(merged_nonzero, chr = "1", bp = "2", p = "loading",snp = "SNP",
          col = c("blue4", "orange3"), 
          cex = 0.6)  # taille des points
```

### block PLS_DA

```{r}
# use the mirna, mrna and protein expression levels as predictive datasets
# note that each dataset is measured across the same individuals (samples)

X1<- as.matrix(merged[,23:217022])
X1 <- X1[, apply(X1, 2, sd) != 0]
X2 <- as.matrix(merged[,2:21])

Y <- as.factor(merged$Population)

X <- list(geno = X1, pheno = X2)

```

```{r}
result.diablo.tcga <- block.plsda(X, Y) # run the method

plotIndiv(result.diablo.tcga,
          rep.space = 'XY-variate', 
          ellipse = TRUE,  # plot using the ellipses
          legend = TRUE)
```

### Block sPLS DA

```{r}
# set the number of features to use for the X datasets
list.keepX = list(geno = c(rep(10000,5)), pheno = c(rep(20,5)))
# run the method
result.sparse.diablo.tcga <-  block.splsda(X, Y, keepX = list.keepX, ncomp = 5)
```

```{r}
plotLoadings(result.sparse.diablo.tcga, ncomp = 5)
```

Chr13_15574007 ressort encore

```{r}
plotIndiv(result.sparse.diablo.tcga,ellipse = TRUE,legend = TRUE) 
```

```{r}
plotVar(result.sparse.diablo.tcga) # plot the variables
```

### Bootstrapped sPLS

```{r}
library(spls)
y <- merged$CIRC2009  
y <- as.numeric(y)
X <- as.matrix(merged[,23:217022])
```

```{r}
# Split stratifié 
# Ici, on fait un split simple car phénotype continu
train_index <- createDataPartition(y, p = 0.9, list = FALSE)

X_train <- X[train_index, ]
var_threshold <- 1e-8

nonzero_cols <- apply(X_train, 2, var) > var_threshold

length(nonzero_cols)
X_train <- X_train[, nonzero_cols]

X_test  <- X[-train_index, ]
X_test  <- X_test[, nonzero_cols]


y_train <- y[train_index]

y_test  <- y[-train_index]
```

```{r}
eta = seq(0.1,0.9,0.1)
K = c(1:5)
cv.spls( X_train, y_train, fold=5, K, eta, kappa=0.5, select="pls2", fit="simpls",scale.x=FALSE, scale.y=FALSE, plot.it=TRUE )
```

```{r}
model <- spls(X_train, y_train, K = 4, eta =0.2 , kappa=0.5, select="pls2", fit="simpls",
scale.x=FALSE, scale.y=FALSE, eps=1e-4, maxstep=100, trace=FALSE)
```

```{r}
ped <- predict.spls( model, X_test, type = c("fit","coefficient"))
```

```{r}
ped
```

```{r}
f <- spls::ci.spls( model, coverage=0.95, B=1000,
                    plot.it=TRUE, plot.fix="y",
                    plot.var=NA, K=model$K, fit=model$fit )


```

```{r}
names(f)
```

```{r}
nonzero_idx <- which(f$betahat != 0)
betahat_nonzero <- f$betahat[nonzero_idx]
```

```{r}
length(nonzero_idx)
```

## MFA

```{r}
res = MFA(geno_mat, group=c(25974,16646,12284,11790,14769,17004,8327,14040,9840,15964,7483,7403,8040,11123,8486,7629,6747,8267,4876,308), type=c(rep("s",20)), ncp=5, name.group=c("Chr01","Chr02","Chr03","Chr04", "Chr05","Chr06","Chr07","Chr08","Chr09","Chr10", "Chr11","Chr12","Chr13", "Chr14","Chr15","Chr16","Chr17","Chr18","Chr19", "scaffold"))
```

```{r}
dim(geno_mat)
```

# PLS : Phenotype \~ all SNP

```{r}
inner_train_index <- createDataPartition(y_train, p = 0.9, list = FALSE)
X_train <- X_train_df[inner_train_index, , drop = FALSE]
y_train <- y_train[inner_train_index]
X_test <- X_train_df[-inner_train_index, , drop = FALSE]
y_test <- y_train[-inner_train_index]
```


```{r}
df_train <- data.frame(y = y_train, X_train_df)
pls_final <- plsr(y ~ ., data = df_train, ncomp = ncomp_opt, validation = "none")
  
  # Évaluation sur le fold externe 
  
y_pred_final <- as.vector(predict(pls_final, newdata = X_test_df, ncomp = ncomp_opt))
```


# PLS Subset SNP \| calcul de VIP \| nested CV

$$VIP_{j} = \sqrt{ p \frac{\displaystyle \sum_{h=1}^{A} SS_{Y,h}\frac{w_{jh}^{2}}{\lVert w_{h} \rVert^{2}}}{\displaystyle \sum_{h=1}^{A} SS_{Y,h}} }$$

```{r}
set.seed(123)
# Paramètres
n_iter <- 400
subset_size <- 10000     # nombre de SNP aléatoires par itération
ncomp_candidates <- 1:5  # candidats pour ncomp
nfold_outer <- 5
nfold_inner <- 10
```

```{r}
results_perf
vip_perf<- results_perf[, .(R2_test_mean = mean(R2_test)), by = iter]
vip_perf_fold<- results_perf[, .(R2_test_mean = mean(R2_test)), by = fold]
```

```{r}
ggplot(results_perf[1:1000,], aes(x = factor(iter), y = R2_test)) +
  geom_violin(fill = "skyblue", color = "black") +
  labs(x = "Itération", y = expression(R^2), title = "R² 10-outer-fold ") +
  theme_minimal()
```

```{r}
mean(vip_perf_fold$R2_test_mean)
```

```{r}
vip_perf_fold
```

```{r}
library(data.table)
library(ggplot2)


# Calcul du VIP moyen par SNP
vip_mean<- results_vip[, .(VIP_mean = mean(VIP),SNP_count = .N), by = SNP]
```

```{r}
# Sélection des 30 SNP ayant le VIP moyen le plus élevé
topN<- vip_mean[order(-VIP_mean)][0:5000, SNP]

df_topNPLS<- results_vip[SNP %in% topN]
df_topNPLS <- merge(df_topNPLS, vip_mean[, .(SNP, SNP_count)], by = "SNP")
df_topNPLS$SNP_count <- df_topNPLS$SNP_count/10

```

```{r}
ggplot(df_topNPLS, aes(x = reorder(SNP,VIP), y = VIP, fill = SNP_count)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.1, outlier.size = 0.5) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
  ) +
  labs(
    x = "SNP",
    y = "VIP",
    title = "Distribution du VIP pour les n = 50 SNP les plus importants",
    fill = "Nbr de selection du SNP"
  )
```

```{r}
hist(vip_mean$VIP_mean, breaks = seq(min(vip_mean$VIP_mean), max(vip_mean$VIP_mean), length.out = 50),
     main = "Distribution des scores VIP moyens",
     xlab = "scores VIP moyens", col = "lightblue", border = "white")
```

# Lasso

```{r}
library(glmnet)

# Assurer que y est numérique et X est une matrice
y <- as.numeric(pheno$CIRC2009)
X <- as.matrix(geno)

train_index <- createDataPartition(y, p = 0.8, list = FALSE)
X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

# Ajustement du Lasso avec validation croisée pour choisir lambda
set.seed(123)

cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1, nfolds = 5)

# Afficher la meilleure valeur de lambda
best_lambda <- cv_lasso$lambda.min
print(best_lambda)

# Tracer l'erreur de validation croisée
plot(cv_lasso)

# Ajuster le modèle final avec lambda optimal
lasso_model <- glmnet(X_train, y_train, alpha = 1, lambda = best_lambda)

# Extraire les coefficients
coef(lasso_model)
```

```{r}
print(best_lambda)
```

```{r}
y_pred <- predict(lasso_model, newx = X_test)

# Calcul du R²
SSE <- sum((y_test - y_pred)^2)           # somme des carrés des erreurs
SST <- sum((y_test - mean(y))^2)         # somme totale des carrés
R2 <- 1 - SSE/SST
print(sqrt(SSE))
R2
```

```{r}
coef_df <- data.frame(
  SNP = rownames(coef(lasso_model)),
  Coefficient = as.numeric(coef(lasso_model))
)
coef_nonzero <- coef_df[coef_df$Coefficient != 0, ]
coef_nonzero_dt <- as.data.table(coef_nonzero)

# Top N SNP par valeur absolue des coefficients
N <- 50
coef_nonzero_dt[, abs_coef := abs(Coefficient)]
topN_Lasso <- coef_nonzero_dt[order(-abs_coef)][1:N, .(SNP, Coefficient)]

topN_Lasso
```

# Comparaison selection Lasso et PLS

```{r}

common_snps <- intersect(df_topNPLS$SNP, coef_nonzero$SNP)

# SNP communs
common_snps
length(common_snps)

```

# Régression ridge adaptative

```{r}
y <- as.numeric(pheno$CIRC2009)
X <- as.matrix(geno)

# Split externe train/test
set.seed(123)
train_index <- createDataPartition(y, p = 0.9, list = FALSE)
X_train <- X[train_index, ]
X_test  <- X[-train_index, ]
y_train <- y[train_index]
y_test  <- y[-train_index]

# Split interne du train en D1 et D2 : 50/50
set.seed(123)
idx_D1 <- createDataPartition(y_train, p = 0.5, list = FALSE)

X_D1 <- X_train[idx_D1, ]
X_D2 <- X_train[-idx_D1, ]
y_D1 <- y_train[idx_D1]
y_D2 <- y_train[-idx_D1]
```

### STEP I : SCREENING (Elastic Net sur D1)

```{r}
alpha <- 1 # Elastic Net si alpha = 0.5, Lasso si = 1, ridge si =0 
set.seed(123)
cv_enet <- cv.glmnet(
  X_D1, y_D1,
  alpha = alpha,               
  nfolds = 10,
  standardize = TRUE
)

lambda_hat <- cv_enet$lambda.min
```

```{r}
lambda_hat
```

```{r}
# Ajustement final 
enet_model <- glmnet(X_D1, y_D1, alpha = alpha, lambda = 0.002)


coefs <- coef(enet_model)
selected <- which(coefs[-1] != 0)          # indices des SNP sélectionnés
S_hat <- selected

cat("Nombre de variables sélectionnées (screening):", length(S_hat), "\n")

# Poids dérivés des coefficients Elastic Net
beta_hat <- as.numeric(coefs[-1])
weights <- abs(beta_hat)
weights <- weights[S_hat]
weights <- weights / max(weights)          # normalisation
```

```{r}
lambda_hat <- 0.002
```

## STEP II : CLEANING (Ridge sur D2)

```{r}
X_D2_sel <- X_D2[, S_hat, drop = FALSE]

# Pondération Ridge : mettre plus de pénalité sur les petites β_EN
ridge_penalty <- 1 / (weights + 1e-6)

set.seed(123)
cv_ridge <- cv.glmnet(
  X_D2_sel, y_D2,
  alpha = 0,                     # alpha = 0 → Ridge
  nfolds = 10,
  penalty.factor = ridge_penalty
)

lambda_ridge <- cv_ridge$lambda.min

# Ajustement final sur D2
ridge_model <- glmnet(
  X_D2_sel, y_D2,
  alpha = 0,
  lambda = lambda_ridge,
  penalty.factor = ridge_penalty
)

# Coefficients ridge finaux (sur D2)
beta_clean <- coef(ridge_model)
```

# STEP III : REFIT FINAL SUR D = D1 ∪ D2 (train complet)

```{r}
X_train_sel <- X_train[, S_hat, drop = FALSE]

ridge_final <- glmnet(
  X_train_sel, y_train,
  alpha = 0,
  lambda = lambda_ridge,
  penalty.factor = ridge_penalty
)

# Prédictions sur le test externe
X_test_sel <- X_test[, S_hat, drop = FALSE]
y_pred <- predict(ridge_final, newx = X_test_sel)

# R² externe
R2_test <- 1 - sum((y_test - y_pred)^2) / sum((y_test - mean(y_test))^2)
cat("R2 externe :", R2_test, "\n")
```
